################################################################################
# 03_create_descriptive_tables.R
# Generate Descriptive Tables 1, 2, and 3 for JHR Paper
#
# Tables Generated:
#   - Table 1: State Descriptive Information
#   - Table 2: Summary Statistics
#   - Table 3: Testing Standards (PPST z-scores)
#
# Inputs:
#   - data/cleaned/ets_treatment_data.xlsx
#   - data/cleaned/enrollment_event_data.xlsx (if available)
#   - data/cleaned/graduation_event_data.xlsx (if available)
#   - data/raw/ets/cleaned ppst and core and composite.dta
#
# Outputs:
#   - output/tables/table_1_state_info.xlsx
#   - output/tables/table_2_summary_statistics.xlsx
#   - output/tables/table_3_ppst_zscores.xlsx
################################################################################

library(tidyverse)
library(readxl)
library(writexl)
library(haven)

# Output directory
out_dir <- "output/tables"
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)

cat("\n========================================\n")
cat("Creating Descriptive Tables 1, 2, and 3\n")
cat("========================================\n\n")

# ──────────────────────────────────────────────────────────────────────────────
# Load Raw ETS Data for Table 3
# ──────────────────────────────────────────────────────────────────────────────

cat("Loading ETS data...\n")

# Load additional states (ND, OR, TN)
additional_states_file <- "data/raw/ets/states_we_add_back_in.xlsx"
if (file.exists(additional_states_file)) {
  additional_states <- read_excel(additional_states_file) %>%
    mutate(z_score = (passingscore - test_mean) / test_sd) %>%
    mutate(ID = as.character(ID))
} else {
  additional_states <- tibble()
}

# Load main ETS data
ets_file <- "data/raw/ets/cleaned ppst and core and composite.dta"
if (file.exists(ets_file)) {
  ets_raw <- read_stata(ets_file) %>%
    select(-any_of("eftotlt")) %>%
    bind_rows(additional_states) %>%
    mutate(z_score = (passingscore - test_mean) / test_sd)
} else {
  stop("ETS data file not found: ", ets_file)
}

# ──────────────────────────────────────────────────────────────────────────────
# TABLE 1: State Descriptive Information
# ──────────────────────────────────────────────────────────────────────────────

cat("\nCreating Table 1: State Descriptive Information...\n")

# Define states that used PPST
ppst_states <- c("AK", "AR", "CT", "DC", "DE", "HI", "IN", "LA", "ME", "MD",
                 "MS", "NE", "NV", "NH", "NJ", "NC", "ND", "OK", "OR", "PA",
                 "SC", "TN", "VT", "VA", "WV", "WI")

# Define Core adoption dates
core_adoption <- tribble(
  ~State, ~Core_Adoption,
  "AK", "Fall 2014",
  "AR", "Fall 2013",
  "CT", "Fall 2014",
  "DC", "Fall 2013",
  "DE", "Spring 2014",
  "HI", "Fall 2013",
  "IN", NA_character_,
  "IA", NA_character_,
  "KY", "Fall 2014",
  "LA", "Fall 2014",
  "ME", "Fall 2013",
  "MD", "Fall 2014",
  "MS", "Fall 2013",
  "NE", "Fall 2014",
  "NV", "Fall 2013",
  "NH", "Fall 2013",
  "NJ", "Fall 2013",
  "NC", "Fall 2013",
  "ND", "Fall 2014",
  "OK", "+",
  "OR", "Spring 2014",
  "PA", "Fall 2014",
  "RI", NA_character_,
  "SC", "Fall 2013",
  "TN", "Fall 2013",
  "VT", "Fall 2013",
  "VA", "Fall 2013",
  "WA", "Fall 2014",
  "WV", "Fall 2013",
  "WI", "Fall 2014"
)

# Define which states adopted recommended pass scores
adopted_scores <- c("AK", "AR", "CT", "DC", "DE", "HI", "LA", "ME", "MD",
                    "MS", "NE", "NV", "NH", "NJ", "NC", "OR", "PA", "SC",
                    "TN", "VT", "VA", "WV", "WI", "KY", "RI")

# Define panel participants
panel_participants <- c("AK", "AR", "CT", "DC", "DE", "HI", "KY", "LA", "ME",
                        "MD", "MS", "NE", "NH", "NJ", "NC", "ND", "PA", "RI",
                        "SC", "TN", "VT", "WV", "WI")

# Define sample states
sample_states <- c("AK", "AR", "CT", "DC", "DE", "HI", "LA", "ME", "MD", "MS",
                   "NE", "NV", "NH", "NJ", "NC", "OR", "PA", "SC", "VT", "VA",
                   "WV", "WI")

# Define shrinking states (school-age pop decreased 2008-2018)
# Note: DC is included here to match the published paper's Table 1.
# The regression code (06_main_regressions.R) excludes DC from shrinking
# subsamples because pop_percentage_change > 0 for DC in the regression data.
shrinking_states <- c("CT", "DC", "LA", "ME", "MS", "NH", "NJ", "PA", "VT", "WV", "WI")

# Create Table 1
table_1 <- tibble(
  State = c("AK", "AR", "CT", "DC", "DE", "HI", "IN", "IA", "KY", "LA",
            "ME", "MD", "MS", "NE", "NV", "NH", "NJ", "NC", "ND", "OK",
            "OR", "PA", "RI", "SC", "TN", "VT", "VA", "WA", "WV", "WI")
) %>%
  mutate(
    PPST = if_else(State %in% ppst_states, "yes", "no"),
    `Core Adoption Date` = core_adoption$Core_Adoption[match(State, core_adoption$State)],
    `Adopted Pass Scores` = case_when(
      State == "CT" ~ "yes*",
      State == "ME" ~ "yes*",
      State == "PA" ~ "yes*",
      State == "SC" ~ "yes*",
      State == "ND" ~ "no*",
      State %in% adopted_scores ~ "yes",
      TRUE ~ "no"
    ),
    `Panel Participant` = if_else(State %in% panel_participants, "yes", "no"),
    `In Sample` = if_else(State %in% sample_states, "yes", "no"),
    `Shrinking State` = case_when(
      !State %in% sample_states ~ "n/a",
      State %in% shrinking_states ~ "yes",
      TRUE ~ "no"
    )
  )

write_xlsx(table_1, file.path(out_dir, "table_1_state_info.xlsx"))
cat("  Saved: table_1_state_info.xlsx\n")
cat("  States in sample: ", sum(table_1$`In Sample` == "yes"), "\n")

# ──────────────────────────────────────────────────────────────────────────────
# TABLE 3: Testing Standards (PPST z-scores and delta TDI)
# ──────────────────────────────────────────────────────────────────────────────

cat("\nCreating Table 3: Testing Standards (PPST z-scores)...\n")

# Get PPST data for 2012 (last year of PPST)
# Note: In the data, time == "old" represents PPST, time == "new" represents Praxis Core
ppst_data <- ets_raw %>%
  filter(year == 2012, time == "old") %>%
  select(State, subject, passingscore, test_mean, test_sd, z_score) %>%
  mutate(subject = recode(subject, "read" = "reading", "write" = "writing"))

# States with composite score loopholes
composite_states <- c("HI", "MD", "ME", "NH", "PA", "VA", "VT")

# Adjust passing scores for composite states (subtract 3 points)
ppst_data <- ppst_data %>%
  mutate(
    passingscore_adj = if_else(State %in% composite_states,
                                passingscore - 3,
                                passingscore),
    z_score_adj = (passingscore_adj - test_mean) / test_sd
  )

# Reshape to wide format
ppst_wide <- ppst_data %>%
  select(State, subject, passingscore_adj, z_score_adj) %>%
  pivot_wider(
    names_from = subject,
    values_from = c(passingscore_adj, z_score_adj)
  )

# Get Praxis Core data for 2014 (first full year)
# Note: time == "new" represents Praxis Core
core_data <- ets_raw %>%
  filter(year == 2014, time == "new") %>%
  select(State, subject, passingscore, test_mean, test_sd) %>%
  mutate(z_score = (passingscore - test_mean) / test_sd)

# Calculate TDI for PPST (average of 3 z-scores)
ppst_tdi <- ppst_data %>%
  group_by(State) %>%
  summarize(
    TDI_PPST = mean(z_score_adj, na.rm = TRUE),
    .groups = "drop"
  )

# Calculate TDI for Core (should be same for all states ~-0.42)
core_tdi <- core_data %>%
  group_by(State) %>%
  summarize(
    TDI_Core = mean(z_score, na.rm = TRUE),
    .groups = "drop"
  )

# Calculate delta TDI
delta_tdi <- ppst_tdi %>%
  left_join(core_tdi, by = "State") %>%
  mutate(delta_TDI = TDI_Core - TDI_PPST)

# Create final Table 3
table_3 <- ppst_wide %>%
  left_join(delta_tdi, by = "State") %>%
  filter(State %in% sample_states) %>%
  mutate(
    State = if_else(State %in% composite_states, paste0(State, "*"), State)
  ) %>%
  select(
    State,
    `Math Raw Score` = passingscore_adj_math,
    `Math Z-Score` = z_score_adj_math,
    `Reading Raw Score` = passingscore_adj_reading,
    `Reading Z-Score` = z_score_adj_reading,
    `Writing Raw Score` = passingscore_adj_writing,
    `Writing Z-Score` = z_score_adj_writing,
    TDI = TDI_PPST,
    `Delta TDI` = delta_TDI
  ) %>%
  arrange(State) %>%
  # Add averages row
  bind_rows(
    tibble(
      State = "Average",
      `Math Raw Score` = mean(ppst_wide$passingscore_adj_math[ppst_wide$State %in% sample_states], na.rm = TRUE),
      `Math Z-Score` = mean(ppst_wide$z_score_adj_math[ppst_wide$State %in% sample_states], na.rm = TRUE),
      `Reading Raw Score` = mean(ppst_wide$passingscore_adj_reading[ppst_wide$State %in% sample_states], na.rm = TRUE),
      `Reading Z-Score` = mean(ppst_wide$z_score_adj_reading[ppst_wide$State %in% sample_states], na.rm = TRUE),
      `Writing Raw Score` = mean(ppst_wide$passingscore_adj_writing[ppst_wide$State %in% sample_states], na.rm = TRUE),
      `Writing Z-Score` = mean(ppst_wide$z_score_adj_writing[ppst_wide$State %in% sample_states], na.rm = TRUE),
      TDI = mean(delta_tdi$TDI_PPST[delta_tdi$State %in% sample_states], na.rm = TRUE),
      `Delta TDI` = mean(delta_tdi$delta_TDI[delta_tdi$State %in% sample_states], na.rm = TRUE)
    )
  )

# Round numeric columns
table_3 <- table_3 %>%
  mutate(across(where(is.numeric), ~round(., 2)))

write_xlsx(table_3, file.path(out_dir, "table_3_ppst_zscores.xlsx"))
cat("  Saved: table_3_ppst_zscores.xlsx\n")
cat("  Average Delta TDI: ", round(mean(delta_tdi$delta_TDI[delta_tdi$State %in% sample_states], na.rm = TRUE), 2), "\n")

# ──────────────────────────────────────────────────────────────────────────────
# TABLE 2: Summary Statistics
# ──────────────────────────────────────────────────────────────────────────────

cat("\nCreating Table 2: Summary Statistics...\n")

# Try to load enrollment/graduation data if available
enrollment_file <- "data/cleaned/enrollment_event_data.xlsx"
graduation_file <- "data/cleaned/graduation_event_data.xlsx"

if (file.exists(enrollment_file) && file.exists(graduation_file)) {

  enrollment_data <- read_excel(enrollment_file) %>%
    filter(!State %in% c("ND", "TN"))

  graduation_data <- read_excel(graduation_file) %>%
    filter(!State %in% c("ND", "TN"))

  # Define selectivity based on SAT/ACT scores as of 2010
  # Per paper Table 2 note (p37): "More selective universities are defined as
  # those with SAT/ACT scores above the in-sample median as of 2010."
  # Uses strictly greater (>) to match 06_main_regressions.R definition.
  enroll_2010 <- enrollment_data %>% filter(year == 2010)
  med_sat_enroll <- median(enroll_2010$satvr25_2, na.rm = TRUE)
  med_act_enroll <- median(enroll_2010$actcm25_2, na.rm = TRUE)

  enrollment_data <- enrollment_data %>%
    mutate(
      selective = case_when(
        is.na(satvr25_2) & is.na(actcm25_2) ~ "Less Selective",
        (!is.na(satvr25_2) & satvr25_2 > med_sat_enroll) |
          (!is.na(actcm25_2) & actcm25_2 > med_act_enroll) ~ "More Selective",
        TRUE ~ "Less Selective"
      )
    )

  grad_2010 <- graduation_data %>% filter(year == 2010)
  med_sat_grad <- median(grad_2010$satvr25_2, na.rm = TRUE)
  med_act_grad <- median(grad_2010$actcm25_2, na.rm = TRUE)

  graduation_data <- graduation_data %>%
    mutate(
      selective = case_when(
        is.na(satvr25_2) & is.na(actcm25_2) ~ "Less Selective",
        (!is.na(satvr25_2) & satvr25_2 > med_sat_grad) |
          (!is.na(actcm25_2) & actcm25_2 > med_act_grad) ~ "More Selective",
        TRUE ~ "Less Selective"
      )
    )

  # Calculate summary statistics
  # Panel A: Outcomes
  panel_a <- tibble(
    Variable = c("Education fall enrollments", "",
                 "White education fall enrollments", "",
                 "Non-white education fall enrollments", "",
                 "Teacher preparation graduates", "",
                 "White teacher preparation graduates", "",
                 "Non-white teacher preparation graduates", ""),
    All = c(
      round(mean(enrollment_data$eftotlt, na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$eftotlt, na.rm = TRUE), 0), ")"),
      round(mean(enrollment_data$efwhitt, na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$efwhitt, na.rm = TRUE), 0), ")"),
      round(mean(enrollment_data$non_white, na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$non_white, na.rm = TRUE), 0), ")"),
      round(mean(graduation_data$ctotalt, na.rm = TRUE), 0),
      paste0("(", round(sd(graduation_data$ctotalt, na.rm = TRUE), 0), ")"),
      round(mean(graduation_data$cwhitt, na.rm = TRUE), 0),
      paste0("(", round(sd(graduation_data$cwhitt, na.rm = TRUE), 0), ")"),
      round(mean(graduation_data$cnonwhite, na.rm = TRUE), 0),
      paste0("(", round(sd(graduation_data$cnonwhite, na.rm = TRUE), 0), ")")
    ),
    `More Selective` = c(
      round(mean(enrollment_data$eftotlt[enrollment_data$selective == "More Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$eftotlt[enrollment_data$selective == "More Selective"], na.rm = TRUE), 0), ")"),
      round(mean(enrollment_data$efwhitt[enrollment_data$selective == "More Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$efwhitt[enrollment_data$selective == "More Selective"], na.rm = TRUE), 0), ")"),
      round(mean(enrollment_data$non_white[enrollment_data$selective == "More Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$non_white[enrollment_data$selective == "More Selective"], na.rm = TRUE), 0), ")"),
      round(mean(graduation_data$ctotalt[graduation_data$selective == "More Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(graduation_data$ctotalt[graduation_data$selective == "More Selective"], na.rm = TRUE), 0), ")"),
      round(mean(graduation_data$cwhitt[graduation_data$selective == "More Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(graduation_data$cwhitt[graduation_data$selective == "More Selective"], na.rm = TRUE), 0), ")"),
      round(mean(graduation_data$cnonwhite[graduation_data$selective == "More Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(graduation_data$cnonwhite[graduation_data$selective == "More Selective"], na.rm = TRUE), 0), ")")
    ),
    `Less Selective` = c(
      round(mean(enrollment_data$eftotlt[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$eftotlt[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 0), ")"),
      round(mean(enrollment_data$efwhitt[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$efwhitt[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 0), ")"),
      round(mean(enrollment_data$non_white[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$non_white[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 0), ")"),
      round(mean(graduation_data$ctotalt[graduation_data$selective == "Less Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(graduation_data$ctotalt[graduation_data$selective == "Less Selective"], na.rm = TRUE), 0), ")"),
      round(mean(graduation_data$cwhitt[graduation_data$selective == "Less Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(graduation_data$cwhitt[graduation_data$selective == "Less Selective"], na.rm = TRUE), 0), ")"),
      round(mean(graduation_data$cnonwhite[graduation_data$selective == "Less Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(graduation_data$cnonwhite[graduation_data$selective == "Less Selective"], na.rm = TRUE), 0), ")")
    )
  )

  # Panel B: Time-varying Controls
  panel_b <- tibble(
    Variable = c("Total freshmen enrollment", "",
                 "SAT submission rate", "",
                 "ACT submission rate", "",
                 "SAT math 25th percentile score", "",
                 "SAT verbal 25th percentile score", "",
                 "ACT composite 25th percentile score", "",
                 "Pell grant (dollar amount, thousands)", "",
                 "Pell grant (% student)", "",
                 "Loan (dollar amount, thousands)", "",
                 "Loan (% student)", "",
                 "Number of universities"),
    All = c(
      round(mean(enrollment_data$enrollment_total2, na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$enrollment_total2, na.rm = TRUE), 0), ")"),
      round(mean(enrollment_data$satpct2, na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$satpct2, na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$actpct2, na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$actpct2, na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$satmt25_2, na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$satmt25_2, na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$satvr25_2, na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$satvr25_2, na.rm = TRUE), 0), ")"),
      round(mean(enrollment_data$actcm25_2, na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$actcm25_2, na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$pell_amount2, na.rm = TRUE) / 1000, 0),
      paste0("(", round(sd(enrollment_data$pell_amount2, na.rm = TRUE) / 1000, 0), ")"),
      round(mean(enrollment_data$pell_percent2, na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$pell_percent2, na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$loan_average2, na.rm = TRUE) / 1000, 0),
      paste0("(", round(sd(enrollment_data$loan_average2, na.rm = TRUE) / 1000, 0), ")"),
      round(mean(enrollment_data$loan_percent2, na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$loan_percent2, na.rm = TRUE), 1), ")"),
      n_distinct(enrollment_data$unitid)
    ),
    `More Selective` = c(
      round(mean(enrollment_data$enrollment_total2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$enrollment_total2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 0), ")"),
      round(mean(enrollment_data$satpct2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$satpct2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$actpct2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$actpct2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$satmt25_2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$satmt25_2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$satvr25_2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$satvr25_2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 0), ")"),
      round(mean(enrollment_data$actcm25_2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$actcm25_2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$pell_amount2[enrollment_data$selective == "More Selective"], na.rm = TRUE) / 1000, 0),
      paste0("(", round(sd(enrollment_data$pell_amount2[enrollment_data$selective == "More Selective"], na.rm = TRUE) / 1000, 0), ")"),
      round(mean(enrollment_data$pell_percent2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$pell_percent2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$loan_average2[enrollment_data$selective == "More Selective"], na.rm = TRUE) / 1000, 0),
      paste0("(", round(sd(enrollment_data$loan_average2[enrollment_data$selective == "More Selective"], na.rm = TRUE) / 1000, 0), ")"),
      round(mean(enrollment_data$loan_percent2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$loan_percent2[enrollment_data$selective == "More Selective"], na.rm = TRUE), 1), ")"),
      n_distinct(enrollment_data$unitid[enrollment_data$selective == "More Selective"])
    ),
    `Less Selective` = c(
      round(mean(enrollment_data$enrollment_total2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$enrollment_total2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 0), ")"),
      round(mean(enrollment_data$satpct2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$satpct2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$actpct2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$actpct2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$satmt25_2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$satmt25_2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$satvr25_2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 0),
      paste0("(", round(sd(enrollment_data$satvr25_2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 0), ")"),
      round(mean(enrollment_data$actcm25_2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$actcm25_2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$pell_amount2[enrollment_data$selective == "Less Selective"], na.rm = TRUE) / 1000, 0),
      paste0("(", round(sd(enrollment_data$pell_amount2[enrollment_data$selective == "Less Selective"], na.rm = TRUE) / 1000, 0), ")"),
      round(mean(enrollment_data$pell_percent2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$pell_percent2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 1), ")"),
      round(mean(enrollment_data$loan_average2[enrollment_data$selective == "Less Selective"], na.rm = TRUE) / 1000, 0),
      paste0("(", round(sd(enrollment_data$loan_average2[enrollment_data$selective == "Less Selective"], na.rm = TRUE) / 1000, 0), ")"),
      round(mean(enrollment_data$loan_percent2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 1),
      paste0("(", round(sd(enrollment_data$loan_percent2[enrollment_data$selective == "Less Selective"], na.rm = TRUE), 1), ")"),
      n_distinct(enrollment_data$unitid[enrollment_data$selective == "Less Selective"])
    )
  )

  table_2 <- bind_rows(
    tibble(Variable = "Panel A: Outcomes", All = "", `More Selective` = "", `Less Selective` = ""),
    panel_a,
    tibble(Variable = "", All = "", `More Selective` = "", `Less Selective` = ""),
    tibble(Variable = "Panel B: Time-varying Controls", All = "", `More Selective` = "", `Less Selective` = ""),
    panel_b
  )

  write_xlsx(table_2, file.path(out_dir, "table_2_summary_statistics.xlsx"))
  cat("  Saved: table_2_summary_statistics.xlsx\n")
  cat("  Total universities: ", n_distinct(enrollment_data$unitid), "\n")

} else {
  cat("  Warning: Enrollment/graduation data not found. Skipping Table 2.\n")
  cat("  Run 04_merge_event_data.R and 05_create_graduation_data.R first.\n")

  # Create a placeholder Table 2 with structure from paper
  table_2_placeholder <- tibble(
    Variable = c("Panel A: Outcomes", "Education fall enrollments", "...",
                 "Panel B: Time-varying Controls", "Total freshmen enrollment", "..."),
    All = c("", "566 (769)", "...", "", "892 (1135)", "..."),
    `More Selective` = c("", "720 (762)", "...", "", "1259 (1379)", "..."),
    `Less Selective` = c("", "398 (74)", "...", "", "458 (545)", "...")
  )

  write_xlsx(table_2_placeholder, file.path(out_dir, "table_2_summary_statistics.xlsx"))
  cat("  Saved placeholder: table_2_summary_statistics.xlsx\n")
  cat("  Note: This is a placeholder. Run data cleaning scripts for full table.\n")
}

# ──────────────────────────────────────────────────────────────────────────────
# Summary
# ──────────────────────────────────────────────────────────────────────────────

cat("\n========================================\n")
cat("Descriptive Tables Summary\n")
cat("========================================\n")
cat("Created:\n")
cat("  - Table 1: State Descriptive Information\n")
cat("  - Table 2: Summary Statistics\n")
cat("  - Table 3: Testing Standards (PPST z-scores)\n")
cat("\nOutput location: ", out_dir, "\n")
